{
 "metadata": {
  "name": "",
  "signature": "sha256:f3b29b33f8a599bac09dfa7a1937f16c7d0e39467ad5f8b810103b379c032505"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Exercises for \"Hands-on with Pydata: How to Build a Minimal Recommendation Engine\""
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Systems check: imports and files"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import pandas as pd"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Numpy Questions: Indexing\n",
      "## 1. Access an individual element in a NumPy array"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# given the following ndarray, access the its third element\n",
      "arr = np.arange(10)\n",
      "arr"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 2. Access the last column of a 2d array"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# given the following ndarray, access its last column\n",
      "arr = np.array([[5,4,2,5],[4,5,1,12],[0,1,5,4]])\n",
      "arr"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 3. Select all elements from a 2d array that are larger than zero"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# given the following ndarray, obtain all elements that are larger than zero\n",
      "arr = np.array([[-0.28179535,  1.80896278, -1.08991099, -1.20264003,  0.61651465],\n",
      "                [ 0.49983669,  0.28402664, -0.12685554,  0.81266623,  0.96586634]])\n",
      "arr"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 4. Set all negative values of an array to 1"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# given the following ndarray, set the last two elements to 10\n",
      "arr = np.array([1,2,-10,5,-6])\n",
      "arr"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Numpy Questions: Operations"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 1. Compute the sum of a 1D array"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# given the following ndarray, compute its sum\n",
      "arr = np.arange(10)\n",
      "arr"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 2. Compute the mean of a 1D array"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# given the following ndarray, compute its mean\n",
      "arr = np.array([50,-79,80,35])\n",
      "arr"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 3. How do you detect the presence of NANs in an array?"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# given the following ndarray, detect all elements that are nans\n",
      "arr = np.array([np.nan] * 10)\n",
      "arr[2:4] = 5\n",
      "arr"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Pandas questions: Series and DataFrames\n",
      "## 1. Adding a column in a DataFrame"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# given the following DataFrame, add a new column to it\n",
      "df = pd.DataFrame({'col1': [1,2,3,4]})\n",
      "df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 2. Deleting a row in a DataFrame"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# given the following DataFrame, delete row 'd' from it\n",
      "df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])\n",
      "df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 3. Creating a DataFrame from a few Series"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# given the following three Series, create a DataFrame such that it holds them in its columns\n",
      "ser_1 = pd.Series(np.random.randn(6))\n",
      "ser_2 = pd.Series(np.random.randn(6))\n",
      "ser_3 = pd.Series(np.random.randn(6))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Pandas questions: Indexing"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 1. Indexing into a specific column"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# given the following DataFrame, try to index into the 'col_2' column\n",
      "df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},\n",
      "                  columns=['col_1', 'col_2', 'col_3'],\n",
      "                  index=['obs1', 'obs2', 'obs3', 'obs4'])\n",
      "df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 2. Label-based indexing"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# using the same DataFrame, index into the row whose index is 'obs_3'\n",
      "df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Reco systems questions: Data Loading\n",
      "## 1. How to load the `users` and `movies` portions of MovieLens"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "\n",
      "users = pd.read_table('data/ml-1m/users.dat',\n",
      "                      sep='::', header=None,\n",
      "                      names=['user_id', 'gender', 'age', 'occupation', 'zip'])\n",
      "\n",
      "movies = pd.read_table('data/ml-1m/movies.dat',\n",
      "                       sep='::', header=None,\n",
      "                       names=['movie_id', 'title', 'genres'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 2. How to load the training and testing subsets"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# subset version (hosted notebook)\n",
      "movielens_train = pd.read_csv('data/movielens_train.csv', index_col=0)\n",
      "movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "movielens_train.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "movielens_test.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Reco systems questions: Estimation Functions"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 1. Simple content filtering using mean ratings"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# write an 'estimate' function that computes the mean rating of a particular user\n",
      "def estimate1(user_id, movie_id):\n",
      "    # first, index into all ratings by this user\n",
      "    # second, compute the mean of those ratings\n",
      "    # for now, we'll just return None\n",
      "    return None\n",
      "\n",
      "\n",
      "# try it out for a user_id, movie_id pair\n",
      "estimate1(4653, 2648)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## 2. Simple collaborative filtering using mean ratings"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# write an 'estimate' function that computes the mean rating of a particular user\n",
      "def estimate2(user_id, movie_id):\n",
      "    # first, index into all ratings of this movie\n",
      "    # second, compute the mean of those ratings\n",
      "    # for now, we'll just return None\n",
      "    return None\n",
      "\n",
      "    \n",
      "# try it out for a user_id, movie_id pair\n",
      "estimate2(4653, 2648)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Mini-Challenge\n",
      "\n",
      "These are the two functions that you will need to test your `estimate` method."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def compute_rmse(y_pred, y_true):\n",
      "    \"\"\" Compute Root Mean Squared Error. \"\"\"\n",
      "    \n",
      "    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def evaluate(estimate_f):\n",
      "    \"\"\" RMSE-based predictive performance evaluation with pandas. \"\"\"\n",
      "    \n",
      "    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)\n",
      "    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])\n",
      "    real = movielens_test.rating.values\n",
      "    return compute_rmse(estimated, real)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# write your estimate function here\n",
      "def my_estimate_func(user_id, movie_id):\n",
      "    return 3.0"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "With those, you can test for performance with the following line, which assumes that your function is called `my_estimate_func`:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print 'RMSE for my estimate function: %s' % evaluate(my_estimate_func)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Once you are happy with your score, you can submit your RMSE by running this function (in the hosted notebook only):"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from update_score import update_score\n",
      "update_score(evaluate(my_estimate_func))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}